# Silence convergence warnings from some alpha testings
import warnings
warnings.filterwarnings('ignore')
# Standard data work / math ops
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA as PCA
# For plotting
import matplotlib.pyplot as plt
import matplotlib
from matplotlib import cm
from matplotlib.ticker import LinearLocator
import seaborn as sb
# Linear regression and metrics
import sklearn.linear_model as LM
import sklearn.metrics as metrics
# Feature and model selection
import sklearn.model_selection as MS
import sklearn.feature_selection as FS
Define PCA transform to keep the PCs accounting for 95% of data variance
def PCATransform(data, threshold=.95):
    """Project `data` onto the leading principal components that together
    capture at least `threshold` (default 95%) of the total variance.

    Parameters
    ----------
    data : pd.DataFrame
        Numeric feature matrix (continuous variables only).
    threshold : float, optional
        Fraction of total variance the kept components must account for.

    Returns
    -------
    pd.DataFrame
        The transformed data, one column per retained component.
    """
    pca = PCA()
    pca.fit(data)
    # First index where the cumulative explained variance crosses the
    # threshold; +1 converts the 0-based index into a component count.
    # min() guards the degenerate case threshold > total variance.
    cumVar = np.cumsum(pca.explained_variance_ratio_)
    toKeep = min(int(np.searchsorted(cumVar, threshold)) + 1, len(cumVar))
    totalVar = cumVar[toKeep - 1]
    print("Top {} components capture {:.2f}% of the data".format(toKeep,totalVar*100))
    # Reuse the already-fitted PCA instead of fitting a second model:
    # components are ordered by explained variance, so keeping the first
    # `toKeep` columns of the full projection is equivalent to refitting
    # with n_components=toKeep, at half the cost.
    outData = pd.DataFrame(data=pca.transform(data)[:, :toKeep])
    print("{} -> {}".format(data.shape,outData.shape))
    return outData
Popular data will be separated from raw data. As regression will be used to predict popular chart rankings, all models will be run on popular data only. Datasets used will be all popular data, and the 5 genre breakdowns.
The following data variants will be created per dataset:
# Initial data import
rawData = pd.read_csv("Data/rawData_final.csv")
# Dataset variants for the popular songs, keyed by variant name
popData = {}
# Split popular data from raw and separate chartrank (the regression target)
popData['raw'] = rawData[rawData.popular == 1].drop('popular', axis=1)
popData['all_target'] = popData['raw'].chartrank
popData['all'] = popData['raw'].drop('chartrank',axis=1)
# One-hot encode genre for the combined dataset
popData['all'] = pd.get_dummies(popData['all'], columns=['genre'])
Pull 3 most recent years: 2018, 2019, 2020
# Keep only songs from the 3 most recent years (2018-2020)
recentYear = 2018
popData['recent'] = popData['all'][popData['all'].year >= recentYear]
# One-hot encode year for the recent subset
popData['recent'] = pd.get_dummies(popData['recent'], columns=['year'])
# Align the target with the rows kept in the recent subset
popData['recent_target'] = popData['all_target'][popData['recent'].index]
Get dummy vars for years
# One-hot encode year for the full dataset (done after the 'recent' split
# above so the raw year column was still available for filtering)
popData['all'] = pd.get_dummies(popData['all'], columns=['year'])
Drop duration
# Variant without track duration, to test its impact on the models
popData['nodur'] = popData['all'].drop('duration_ms',axis=1)
PCA on continuous, with original dummies
# PCA on the continuous variables only, with the original dummy columns
# re-attached afterwards. Columns whose names contain any of these
# substrings are treated as dummy-encoded.
dummies = ['time_signature','explicit','key','mode','year','genre']
continuousVars = [x for x in popData['all'].columns]
for dummy in dummies:
    # Remove every column whose name contains the dummy substring
    continuousVars = [x for x in continuousVars if dummy not in x]
print("Continuous variables:")
for var in continuousVars:
    print(" ",var)
print()
# Everything that is not continuous is a dummy column
dummyVars = [x for x in popData['all'].columns if x not in continuousVars]
print('-------Popular PCA-------')
popData['pca'] = PCATransform(popData['all'][continuousVars])
# PCATransform returns a fresh RangeIndex, so reset the dummies' index to
# make the concat align positionally rather than by the original row labels
popData['pca'] = pd.concat([popData['pca'], popData['all'][dummyVars].reset_index(drop=True)], axis=1)
print()
popData['pca'].info()
Continuous variables: acousticness danceability energy duration_ms instrumentalness valence tempo liveness loudness speechiness -------Popular PCA------- Top 7 components capture 96.04% of the data (5654, 10) -> (5654, 7) <class 'pandas.core.frame.DataFrame'> RangeIndex: 5654 entries, 0 to 5653 Data columns (total 51 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 0 5654 non-null float64 1 1 5654 non-null float64 2 2 5654 non-null float64 3 3 5654 non-null float64 4 4 5654 non-null float64 5 5 5654 non-null float64 6 6 5654 non-null float64 7 key_0 5654 non-null int64 8 key_1 5654 non-null int64 9 key_2 5654 non-null int64 10 key_3 5654 non-null int64 11 key_4 5654 non-null int64 12 key_5 5654 non-null int64 13 key_6 5654 non-null int64 14 key_7 5654 non-null int64 15 key_8 5654 non-null int64 16 key_9 5654 non-null int64 17 key_10 5654 non-null int64 18 key_11 5654 non-null int64 19 explicit_0 5654 non-null int64 20 explicit_1 5654 non-null int64 21 mode_0 5654 non-null int64 22 mode_1 5654 non-null int64 23 time_signature_1 5654 non-null int64 24 time_signature_3 5654 non-null int64 25 time_signature_4 5654 non-null int64 26 time_signature_5 5654 non-null int64 27 genre_country 5654 non-null uint8 28 genre_jazz 5654 non-null uint8 29 genre_latin 5654 non-null uint8 30 genre_pop 5654 non-null uint8 31 genre_r&b 5654 non-null uint8 32 year_2002 5654 non-null uint8 33 year_2003 5654 non-null uint8 34 year_2004 5654 non-null uint8 35 year_2005 5654 non-null uint8 36 year_2006 5654 non-null uint8 37 year_2007 5654 non-null uint8 38 year_2008 5654 non-null uint8 39 year_2009 5654 non-null uint8 40 year_2010 5654 non-null uint8 41 year_2011 5654 non-null uint8 42 year_2012 5654 non-null uint8 43 year_2013 5654 non-null uint8 44 year_2014 5654 non-null uint8 45 year_2015 5654 non-null uint8 46 year_2016 5654 non-null uint8 47 year_2017 5654 non-null uint8 48 year_2018 5654 non-null uint8 49 year_2019 5654 non-null uint8 50 year_2020 5654 
non-null uint8 dtypes: float64(7), int64(20), uint8(24) memory usage: 1.3 MB
Split by genre, create variants per
# Genre split: build the same per-genre variants (all / recent / nodur / pca)
# that were created for the full popular dataset above.
genres = np.unique(popData['raw'].genre.to_numpy())
popGenre = {}
for genre in genres:
    raw = popData['raw'][popData['raw'].genre == genre]
    target = raw.chartrank
    gAll = raw.drop(['genre','chartrank'],axis=1)
    # Recent 3 years
    recent = gAll[gAll.year >= recentYear]
    recent = pd.get_dummies(recent, columns=['year'])
    recent_target = target[recent.index]
    # Dummy year
    gAll = pd.get_dummies(gAll, columns=['year'])
    # Drop duration
    nodur = gAll.drop('duration_ms',axis=1)
    # Filter into a per-genre local list. Reassigning the shared `dummyVars`
    # here (as the original did) shrinks it on every iteration, silently
    # dropping dummy columns from later genres that happened to be missing
    # from an earlier genre's data.
    gDummies = [x for x in dummyVars if x in gAll.columns]
    # PCA
    print('-------{} PCA-------'.format(genre))
    pca = PCATransform(gAll[continuousVars])
    pca = pd.concat([pca, gAll[gDummies].reset_index(drop=True)], axis=1)
    print()
    popGenre[genre] = {"all":gAll,"all_target":target,"recent":recent,"recent_target":recent_target,"nodur":nodur,"pca":pca}
-------country PCA------- Top 5 components capture 95.62% of the data (1484, 10) -> (1484, 5) -------jazz PCA------- Top 6 components capture 96.62% of the data (627, 10) -> (627, 6) -------latin PCA------- Top 6 components capture 97.00% of the data (1198, 10) -> (1198, 6) -------pop PCA------- Top 7 components capture 96.72% of the data (617, 10) -> (617, 7) -------r&b PCA------- Top 7 components capture 97.91% of the data (1728, 10) -> (1728, 7)
# NOTE(review): `subplots` appears unused in the visible code — confirm
# before removing
subplots = 1
def plotCorrMat(title, corMat, save=True):
    """Render `corMat` as a seaborn heatmap titled "<title> Corr Matrix".

    When `save` is True the figure is also written to
    Images/<title>_CorrMat.png before being displayed.
    """
    plt.figure(figsize=(15, 15), dpi=200)
    sb.heatmap(corMat, annot=False)
    plt.title("{} Corr Matrix".format(title))
    if save:
        plt.savefig("Images/{}_CorrMat.png".format(title))
    plt.show()
plotCorrMat("All",popData['raw'].corr())
for genre in genres:
plotCorrMat(genre + ' all', popGenre[genre]['all'].corr())
plotCorrMat(genre + ' recent', popGenre[genre]['recent'].corr())
There are no notable differences in correlations between the popular dataset and the dataset as a whole
The remaining portion of the notebook will be evaluating linear regression models to predict chart rank for a given popular song. Chart rank will be treated as continuous in this context, so evaluation of the model will have some leniency in its result when considering accuracy.
Define 'Model' class: Holds a linear regression model object from sklearn, a parameter list to explore, and a gridsearch object. The class is used to more efficiently test models on changing data sets.
Functions enable 2D graphing of the alpha parameter for Ridge and Lasso models, as well as 3D graphing of l1_ratio/alpha for the Stochastic Gradient Descent (SGD) model
class Model:
    """Wrapper around an sklearn linear regression estimator.

    Holds the estimator, the hyper-parameter grid to explore, and a
    GridSearchCV object, so the same model family can be evaluated
    repeatedly on changing datasets with either default or grid-searched
    parameters. Also graphs the search results: 2D curves for
    single-parameter grids (Ridge/Lasso alpha) and a 3D bar surface for
    the SGD alpha/l1_ratio grid.
    """
    def __init__(self, name, model, paramList):
        self.name = name            # display name used in printed results
        self.model = model          # sklearn estimator instance
        self.paramList = paramList  # dict: parameter name -> values to search
        self.gs = MS.GridSearchCV(model, paramList, cv=10, n_jobs = -1)
    # Calculate RMSE of the current model's predictions against target
    def RMSE(self,data,target):
        return np.sqrt(np.mean((self.model.predict(data) - target)**2))
    # Fit the model with given data. If finding best fit, run the grid search
    # with k-fold CV and the given scoring method, then keep the best estimator
    def fit(self,data,target,k,findBestFit,score_method):
        if findBestFit:
            self.gs.cv = k if k > 1 else 10  # fall back to 10 folds when k <= 1
            self.gs.scoring = score_method
            self.gs.fit(data,target)
            self.model = self.gs.best_estimator_
        else:
            self.model.fit(data,target)
    # Graph 2D plots of mean CV score against each searched parameter
    def graph(self):
        for i,param in enumerate(self.paramList):
            paramScores = []
            for j,score in enumerate(self.gs.cv_results_['mean_test_score']):
                # "neg_*" scorers report negated errors; flip the sign so the
                # plot shows positive RMSE
                if "neg" in self.gs.scoring:
                    score = -score
                paramVal = self.gs.cv_results_['param_'+param][j]
                paramScores.append([paramVal,score])
            plt.plot([x for x,y in paramScores],[y for x,y in paramScores])
            plt.xlabel("{} values".format(param))
            plt.ylabel("Training cross validation RMSE")
            plt.title("{} mean RMSE vs {} using {}-fold cross validation".format(self.name,param,self.gs.cv))
            plt.show()
    # Sub function for graph3D - append results from grid search to an array of x,y,z points
    def computeBarPoints(self):
        # Compute bar plot arrays for ratio and alpha parameters
        points = []
        for i,paramInputs in enumerate(self.gs.cv_results_['params']):
            paramResult = []
            for param in paramInputs:
                paramResult.append(paramInputs[param])
            # Negate the neg_* score so z is a positive RMSE
            paramResult.append(-self.gs.cv_results_['mean_test_score'][i])
            points.append(paramResult)
        return np.array(points)
    # Graph 3D plots - used for SGD l1_ratio/alpha params
    def graph3D(self,k):
        # Get data as 3D points
        points = self.computeBarPoints()
        # Update font size for plots. Take a *copy* of rcParams: binding the
        # object itself would alias it, so the restore at the end of this
        # method would be a no-op.
        storedSettings = matplotlib.rcParams.copy()
        matplotlib.rcParams.update({'font.size': 8})
        fig = plt.figure(figsize=(20,10),dpi=100)
        ax0 = fig.add_subplot(1,2,1, projection='3d')
        ax1 = fig.add_subplot(1,2,2, projection='3d')
        ys = np.unique(points[:,1])
        xs = np.unique(points[:,0])
        zs = []
        # Calculate and store min/max of z - used to offset graph to better show variance in values since they are typically very close, but the overall graph scale is large
        zmax = max(points[:,2])
        zmin = min(points[:,2]) * .998 # lower min to avoid hiding lowest z coordinate by it being 0 after subtracting min
        # Bar width based on graph scale to be viewable to some degree
        width = (max(xs) - min(xs)) * 1/len(xs) * .25
        # Order z values to ys
        for i,y in enumerate(ys):
            zs.append(points[points[:,1] == y][:,2])
        # Repeating preset colors
        cs = ['r','g','b','purple','gray']
        for i,y in enumerate(ys):
            c = cs[i % len(cs)]
            _ = ax0.bar3d(xs, y, 0, dx=width, dy = width, dz = zs[i]-zmin, color=c, alpha=0.7)
            _ = ax1.bar3d(xs, y, 0, dx=width, dy = width, dz = zs[i]-zmin, color=c, alpha=0.7)
        # Update ticks and labels to match changed scale based on subtracting zmin from all z values
        ticks = np.linspace(0,zmax-zmin,10)
        tickLabels = []
        for tick in ticks:
            tickLabels.append(str(np.round(tick+zmin,3)))
        for i,ax in enumerate([ax0,ax1]):
            # Thin out x ticks below 1 (every other one) to reduce crowding
            xsTick = np.concatenate((xs[xs<1][1::2],xs[xs>=1]))
            ysTick = ys[1::2]
            _ = ax.axes.set_zticks(ticks)
            _ = ax.axes.set_zticklabels(tickLabels)
            _ = ax.axes.set_yticks(ysTick)
            _ = ax.axes.set_xticks(xsTick)
            _ = ax.set_yticklabels(np.round(ysTick,2),rotation=(-20 if i==0 else 20), va='center', ha='center')
            _ = ax.set_xticklabels(np.round(xsTick,2),rotation=(35 if i==0 else -35), va='center', ha='center')
            _ = ax.axes.set_xlabel("Alpha")
            _ = ax.axes.set_ylabel("L1 Ratio")
            _ = ax.axes.set_zlabel("Mean RMSE")
            _ = ax.axes.set_title("{} mean RMSE vs alpha and l1_ratio using {}-fold cross validation (View angle: {})".format(self.name,k,125 if i==0 else -125))
        # NOTE(review): relies on a private matplotlib attribute (_axinfo);
        # may break across matplotlib versions
        ax0.zaxis._axinfo['label']['ha'] = 'left'
        ax1.zaxis._axinfo['label']['ha'] = 'right'
        # Plots are same graph, rotated in opposite directions to enable better view of result
        ax0.view_init(30, 125)
        ax1.view_init(30,-125)
        # plt.show() takes no figure argument; it shows the active figure
        plt.show()
        matplotlib.rcParams.update(storedSettings)
    # Model evaluation and output
    def evaluate(self,trainData,trainTarget,testData,testTarget,k=1,score_method='neg_root_mean_squared_error',findBestFit=False,graph=False):
        # Fit model on given data (best or default vals based on bool)
        self.fit(trainData,trainTarget,k,findBestFit,score_method)
        trainRMSE = self.RMSE(trainData,trainTarget)
        testRMSE = self.RMSE(testData,testTarget)
        # If not using grid search, perform kfold cross validation for accuracy measurement
        if not findBestFit and k > 1:
            scores = MS.cross_val_score(self.model, trainData, trainTarget, cv=k, scoring=score_method)
            # neg_* scorers return negative errors; report the absolute mean
            mean = abs(scores.mean())
            print("Default {} mean RMSE from {}-fold cross validation was {:.3f}".format(self.name,k,mean))
        if not findBestFit:
            print("Default {} RMSE on training data was {:.3f}".format(self.name,trainRMSE))
            print("Default {} R^2 score was {:.5f}".format(self.name,self.model.score(trainData,trainTarget)))
            print("Default {} RMSE on testing data was {:.3f}\n".format(self.name,testRMSE))
        else:
            print("Best fit {} RMSE on training data was {:.3f}".format(self.name,trainRMSE))
            print("Best fit {} R^2 score was {:.5f}".format(self.name,self.model.score(trainData,trainTarget)))
            print("Best fit {} RMSE on testing data was {:.3f}".format(self.name,testRMSE))
            print("Best params used:\n{}\n".format(self.gs.best_params_))
        if graph and findBestFit:
            # Only SGD (elasticnet penalty) searched two parameters and needs
            # the 3D surface; Ridge/Lasso have no `penalty` attribute, raise
            # AttributeError here, and fall through to the 2D plots. The
            # original bare `except:` would also have hidden real errors
            # raised inside graph3D.
            try:
                if self.model.penalty=='elasticnet':
                    self.graph3D(k if k > 1 else 10)
                    return
            except AttributeError:
                pass
            self.graph()
Define constructor method to build model listings and begin evaluation
def constructModels(data,dataTitle='',k=10,R_pList=None,L_pList=None,SGD_pList=None,graph=True):
    """Build Ridge, Lasso, and SGD models and evaluate each on the split.

    Each model is evaluated twice: once with default parameters plus k-fold
    cross validation, then again with grid-searched best-fit parameters.

    Parameters
    ----------
    data : tuple
        (trainData, testData, trainTarget, testTarget), as returned by
        sklearn's train_test_split.
    dataTitle : str, optional
        Header printed before the results; omitted when empty.
    k : int, optional
        Number of cross-validation folds.
    R_pList, L_pList, SGD_pList : dict, optional
        Parameter grids for Ridge, Lasso, and SGD; defaults are built when
        not supplied.
    graph : bool, optional
        Whether to plot the grid-search results.

    Returns
    -------
    list[Model] or None
        The three evaluated models, or None when `data` is not a 4-tuple.
    """
    if len(data) != 4:
        return None
    trainData, testData, trainTarget, testTarget = data
    # If not provided, set parameters for ridge, lasso, SGD grid search
    if R_pList is None:
        alphas = np.append(np.linspace(.0001, 1, 30, False),np.linspace(1, 20, 30))
        R_pList = {'alpha':alphas}
    if L_pList is None:
        alphas = np.append(np.linspace(.0001, 1, 30, False),np.linspace(1, 2, 4))
        L_pList = {'alpha':alphas}
    if SGD_pList is None:
        ratios = np.linspace(0,1,30)
        alphas = np.append(np.linspace(.0001, 1, 30, False),np.linspace(1, 2, 4))
        SGD_pList = {'alpha':alphas,'l1_ratio':ratios}
    # Define models
    models = [Model("Ridge", LM.Ridge(), R_pList), Model("Lasso", LM.Lasso(), L_pList), Model("SGD", LM.SGDRegressor(penalty='elasticnet'), SGD_pList)]
    # Print header
    if dataTitle != '':
        print("------{}------\n------Regression model results\n".format(dataTitle))
    # Evaluate each model with default params, then with best-fit params
    for model in models:
        print("----------{} model----------".format(model.name))
        # Eval with default params and cross validation
        model.evaluate(trainData,trainTarget,testData,testTarget,k=k)
        # Eval with optimal params and cross validation
        model.evaluate(trainData,trainTarget,testData,testTarget,k=k,findBestFit=True,graph=graph)
    return models
Ridge, Lasso, and SGD models will be built and tested on the previously noted subsets/variations of the data to identify the best predictor possible.
Every model will be run on the full datasets, as well as the recent datasets, but of the remaining variants, only the non-genre split variants will be run unless a noticeable improvement is observed in the resulting model. In which case the variant will also be run per genre.
All data pulled across all genres is used.
Overall performance is expected to be low. It is assumed the data is semi-clustered by genre, and grouping everything together will cause outlier songs to throw the model off. Model performance is expected to be somewhat poor.
# Baseline run: all popular data together, random 80/20 train/test split
title='All popular data, 80/20 random split'
data = popData['all']
target = popData['all_target']
models_All = constructModels(MS.train_test_split(data, target, test_size=0.2, random_state=17),dataTitle=title)
------All popular data, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.289
Default Ridge RMSE on training data was 25.028
Default Ridge R^2 score was 0.15392
Default Ridge RMSE on testing data was 25.516
Best fit Ridge RMSE on training data was 25.042
Best fit Ridge R^2 score was 0.15297
Best fit Ridge RMSE on testing data was 25.544
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.061
Best fit Lasso R^2 score was 0.15168
Best fit Lasso RMSE on testing data was 25.571
Best params used:
{'alpha': 0.03343}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.277
Default SGD RMSE on training data was 25.075
Default SGD R^2 score was 0.15073
Default SGD RMSE on testing data was 25.534
Best fit SGD RMSE on training data was 25.097
Best fit SGD R^2 score was 0.14930
Best fit SGD RMSE on testing data was 25.624
Best params used:
{'alpha': 0.03343, 'l1_ratio': 0.896551724137931}
Not grouping the data has, predictably, terrible results. The average RMSE from the models is around 25.5, i.e., 25.5% of the max value of the chart rankings. If there are strong trends within the genres, keeping the data all together like this would produce poor results as many songs would be treated as outliers, whereas within their own genre, they might not be.
# Same evaluation on the recent-3-years variant
title='All popular data, recent 3 years, 80/20 random split'
data = popData['recent']
target = popData['recent_target']
models_recent = constructModels(MS.train_test_split(data, target, test_size=0.2, random_state=17),dataTitle=title)
------All popular data, recent 3 years, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.593
Default Ridge RMSE on training data was 24.763
Default Ridge R^2 score was 0.21355
Default Ridge RMSE on testing data was 28.525
Best fit Ridge RMSE on training data was 24.865
Best fit Ridge R^2 score was 0.20705
Best fit Ridge RMSE on testing data was 28.409
Best params used:
{'alpha': 14.758620689655173}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.903
Default Lasso RMSE on training data was 25.818
Default Lasso R^2 score was 0.14508
Default Lasso RMSE on testing data was 28.058
Best fit Lasso RMSE on training data was 24.825
Best fit Lasso R^2 score was 0.20960
Best fit Lasso RMSE on testing data was 28.425
Best params used:
{'alpha': 0.06676}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.553
Default SGD RMSE on training data was 24.930
Default SGD R^2 score was 0.20290
Default SGD RMSE on testing data was 28.482
Best fit SGD RMSE on training data was 25.045
Best fit SGD R^2 score was 0.19553
Best fit SGD RMSE on testing data was 28.531
Best params used:
{'alpha': 0.13341999999999998, 'l1_ratio': 1.0}
The recent 3 years dataset has produced an average 3% decrease in accuracy for the final models, although cross validation accuracies were on average a little better. As with all data above, if strong trends existed it is assumed they would be prevalent in the genre breakdowns, and grouping the data together would result in a poorer model.
# Same evaluation on the no-duration variant
title='All popular data, no duration, 80/20 random split'
data = popData['nodur']
target = popData['all_target']
models_nodur = constructModels(MS.train_test_split(data, target, test_size=0.2, random_state=17),dataTitle=title)
------All popular data, no duration, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.289
Default Ridge RMSE on training data was 25.030
Default Ridge R^2 score was 0.15381
Default Ridge RMSE on testing data was 25.521
Best fit Ridge RMSE on training data was 25.043
Best fit Ridge R^2 score was 0.15293
Best fit Ridge RMSE on testing data was 25.545
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.061
Best fit Lasso R^2 score was 0.15168
Best fit Lasso RMSE on testing data was 25.571
Best params used:
{'alpha': 0.03343}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.289
Default SGD RMSE on training data was 25.049
Default SGD R^2 score was 0.15253
Default SGD RMSE on testing data was 25.544
Best fit SGD RMSE on training data was 25.061
Best fit SGD R^2 score was 0.15174
Best fit SGD RMSE on testing data was 25.538
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.8275862068965517}
Duration was removed to see if the classification issues we had with it would show up in our prediction models as well. The average RMSE calculated without duration is not noticeably different from that of the original all data run, so duration does not seem to be having a large impact one way or another on our results.
# Same evaluation on the PCA-transformed variant
title='All popular data after PCA, 80/20 random split'
data = popData['pca']
target = popData['all_target']
models_pca = constructModels(MS.train_test_split(data, target, test_size=0.2, random_state=17),dataTitle=title)
------All popular data after PCA, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.295
Default Ridge RMSE on training data was 25.047
Default Ridge R^2 score was 0.15264
Default Ridge RMSE on testing data was 25.529
Best fit Ridge RMSE on training data was 25.057
Best fit Ridge R^2 score was 0.15199
Best fit Ridge RMSE on testing data was 25.555
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.070
Best fit Lasso R^2 score was 0.15110
Best fit Lasso RMSE on testing data was 25.583
Best params used:
{'alpha': 0.03343}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.294
Default SGD RMSE on training data was 25.073
Default SGD R^2 score was 0.15090
Default SGD RMSE on testing data was 25.551
Best fit SGD RMSE on training data was 25.098
Best fit SGD R^2 score was 0.14921
Best fit SGD RMSE on testing data was 25.642
Best params used:
{'alpha': 0.03343, 'l1_ratio': 0.8620689655172413}
PCA was performed to see if the variables had strong linear relations with one another that could be removed, and perhaps help with overfitting. The result is again not distinct from the original all data run.
title='All popular data after dropping correlated variables, 80/20 random split'
data = popData['all']
target = popData['all_target']
# Sweep feature counts: keep the k best features by univariate F-test and
# rerun the models for each k. Smaller parameter grids and graph=False keep
# the total runtime down across the many iterations.
for k in range(30,len(popData['all'].columns)):
    print('Results for {} features'.format(k))
    # SelectKBest returns a plain numpy array; column names are discarded
    data = FS.SelectKBest(score_func=FS.f_regression, k=k).fit_transform(popData['all'],popData['all_target'])
    alphas = np.append(np.linspace(.1, 1, 10, False),np.linspace(1, 20, 20))
    R_pList = {'alpha':alphas}
    alphas = np.linspace(.0001, 1, 20)
    L_pList = {'alpha':alphas}
    ratios = np.linspace(0,1,10)
    alphas = np.append(np.linspace(.0001, 1, 15, False),np.linspace(1, 2, 4))
    SGD_pList = {'alpha':alphas,'l1_ratio':ratios}
    models_nocor = constructModels(MS.train_test_split(data, target, test_size=0.2, random_state=17),dataTitle=title,R_pList=R_pList,L_pList=L_pList,SGD_pList=SGD_pList,graph=False)
Results for 30 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.323
Default Ridge RMSE on training data was 25.178
Default Ridge R^2 score was 0.14376
Default Ridge RMSE on testing data was 25.662
Best fit Ridge RMSE on training data was 25.194
Best fit Ridge R^2 score was 0.14268
Best fit Ridge RMSE on testing data was 25.685
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.178
Best fit Lasso R^2 score was 0.14378
Best fit Lasso RMSE on testing data was 25.661
Best params used:
{'alpha': 0.0001}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.292
Default SGD RMSE on training data was 25.189
Default SGD R^2 score was 0.14302
Default SGD RMSE on testing data was 25.660
Best fit SGD RMSE on training data was 25.197
Best fit SGD R^2 score was 0.14247
Best fit SGD RMSE on testing data was 25.684
Best params used:
{'alpha': 0.0001, 'l1_ratio': 1.0}
Results for 31 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.327
Default Ridge RMSE on training data was 25.177
Default Ridge R^2 score was 0.14381
Default Ridge RMSE on testing data was 25.668
Best fit Ridge RMSE on training data was 25.193
Best fit Ridge R^2 score was 0.14273
Best fit Ridge RMSE on testing data was 25.690
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.177
Best fit Lasso R^2 score was 0.14384
Best fit Lasso RMSE on testing data was 25.666
Best params used:
{'alpha': 0.0001}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.301
Default SGD RMSE on training data was 25.192
Default SGD R^2 score was 0.14279
Default SGD RMSE on testing data was 25.679
Best fit SGD RMSE on training data was 25.191
Best fit SGD R^2 score was 0.14286
Best fit SGD RMSE on testing data was 25.687
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.5555555555555556}
Results for 32 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.320
Default Ridge RMSE on training data was 25.168
Default Ridge R^2 score was 0.14442
Default Ridge RMSE on testing data was 25.638
Best fit Ridge RMSE on training data was 25.185
Best fit Ridge R^2 score was 0.14331
Best fit Ridge RMSE on testing data was 25.663
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.168
Best fit Lasso R^2 score was 0.14445
Best fit Lasso RMSE on testing data was 25.636
Best params used:
{'alpha': 0.0001}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.313
Default SGD RMSE on training data was 25.181
Default SGD R^2 score was 0.14359
Default SGD RMSE on testing data was 25.641
Best fit SGD RMSE on training data was 25.190
Best fit SGD R^2 score was 0.14297
Best fit SGD RMSE on testing data was 25.665
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.0}
Results for 33 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.295
Default Ridge RMSE on training data was 25.141
Default Ridge R^2 score was 0.14626
Default Ridge RMSE on testing data was 25.637
Best fit Ridge RMSE on training data was 25.158
Best fit Ridge R^2 score was 0.14512
Best fit Ridge RMSE on testing data was 25.666
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.141
Best fit Lasso R^2 score was 0.14629
Best fit Lasso RMSE on testing data was 25.635
Best params used:
{'alpha': 0.0001}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.275
Default SGD RMSE on training data was 25.156
Default SGD R^2 score was 0.14529
Default SGD RMSE on testing data was 25.661
Best fit SGD RMSE on training data was 25.197
Best fit SGD R^2 score was 0.14247
Best fit SGD RMSE on testing data was 25.716
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.4444444444444444}
Results for 34 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.267
Default Ridge RMSE on training data was 25.115
Default Ridge R^2 score was 0.14806
Default Ridge RMSE on testing data was 25.615
Best fit Ridge RMSE on training data was 25.130
Best fit Ridge R^2 score was 0.14701
Best fit Ridge RMSE on testing data was 25.637
Best params used:
{'alpha': 15.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.114
Best fit Lasso R^2 score was 0.14810
Best fit Lasso RMSE on testing data was 25.613
Best params used:
{'alpha': 0.0001}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.261
Default SGD RMSE on training data was 25.147
Default SGD R^2 score was 0.14588
Default SGD RMSE on testing data was 25.635
Best fit SGD RMSE on training data was 25.168
Best fit SGD R^2 score was 0.14445
Best fit SGD RMSE on testing data was 25.651
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.4444444444444444}
Results for 35 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.268
Default Ridge RMSE on training data was 25.111
Default Ridge R^2 score was 0.14830
Default Ridge RMSE on testing data was 25.606
Best fit Ridge RMSE on training data was 25.129
Best fit Ridge R^2 score was 0.14711
Best fit Ridge RMSE on testing data was 25.634
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.111
Best fit Lasso R^2 score was 0.14834
Best fit Lasso RMSE on testing data was 25.604
Best params used:
{'alpha': 0.0001}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.276
Default SGD RMSE on training data was 25.141
Default SGD R^2 score was 0.14625
Default SGD RMSE on testing data was 25.637
Best fit SGD RMSE on training data was 25.182
Best fit SGD R^2 score was 0.14347
Best fit SGD RMSE on testing data was 25.643
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.7777777777777777}
Results for 36 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.276
Default Ridge RMSE on training data was 25.111
Default Ridge R^2 score was 0.14830
Default Ridge RMSE on testing data was 25.606
Best fit Ridge RMSE on training data was 25.129
Best fit Ridge R^2 score was 0.14711
Best fit Ridge RMSE on testing data was 25.635
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.169
Best fit Lasso R^2 score was 0.14439
Best fit Lasso RMSE on testing data was 25.692
Best params used:
{'alpha': 0.05272631578947369}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.307
Default SGD RMSE on training data was 25.152
Default SGD R^2 score was 0.14552
Default SGD RMSE on testing data was 25.626
Best fit SGD RMSE on training data was 25.142
Best fit SGD R^2 score was 0.14623
Best fit SGD RMSE on testing data was 25.629
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.5555555555555556}
Results for 37 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.274
Default Ridge RMSE on training data was 25.107
Default Ridge R^2 score was 0.14860
Default Ridge RMSE on testing data was 25.597
Best fit Ridge RMSE on training data was 25.125
Best fit Ridge R^2 score was 0.14739
Best fit Ridge RMSE on testing data was 25.626
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.166
Best fit Lasso R^2 score was 0.14461
Best fit Lasso RMSE on testing data was 25.686
Best params used:
{'alpha': 0.05272631578947369}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.295
Default SGD RMSE on training data was 25.144
Default SGD R^2 score was 0.14605
Default SGD RMSE on testing data was 25.621
Best fit SGD RMSE on training data was 25.143
Best fit SGD R^2 score was 0.14612
Best fit SGD RMSE on testing data was 25.640
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.4444444444444444}
Results for 38 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.265
Default Ridge RMSE on training data was 25.095
Default Ridge R^2 score was 0.14938
Default Ridge RMSE on testing data was 25.586
Best fit Ridge RMSE on training data was 25.112
Best fit Ridge R^2 score was 0.14824
Best fit Ridge RMSE on testing data was 25.616
Best params used:
{'alpha': 19.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.158
Best fit Lasso R^2 score was 0.14515
Best fit Lasso RMSE on testing data was 25.679
Best params used:
{'alpha': 0.05272631578947369}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.291
Default SGD RMSE on training data was 25.136
Default SGD R^2 score was 0.14662
Default SGD RMSE on testing data was 25.621
Best fit SGD RMSE on training data was 25.159
Best fit SGD R^2 score was 0.14503
Best fit SGD RMSE on testing data was 25.616
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.3333333333333333}
Results for 39 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.266
Default Ridge RMSE on training data was 25.091
Default Ridge R^2 score was 0.14970
Default Ridge RMSE on testing data was 25.557
Best fit Ridge RMSE on training data was 25.108
Best fit Ridge R^2 score was 0.14855
Best fit Ridge RMSE on testing data was 25.589
Best params used:
{'alpha': 19.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.090
Best fit Lasso R^2 score was 0.14973
Best fit Lasso RMSE on testing data was 25.555
Best params used:
{'alpha': 0.0001}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.262
Default SGD RMSE on training data was 25.126
Default SGD R^2 score was 0.14733
Default SGD RMSE on testing data was 25.608
Best fit SGD RMSE on training data was 25.109
Best fit SGD R^2 score was 0.14846
Best fit SGD RMSE on testing data was 25.579
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.0}
Results for 40 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.271
Default Ridge RMSE on training data was 25.086
Default Ridge R^2 score was 0.14999
Default Ridge RMSE on testing data was 25.548
Best fit Ridge RMSE on training data was 25.104
Best fit Ridge R^2 score was 0.14877
Best fit Ridge RMSE on testing data was 25.584
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.086
Best fit Lasso R^2 score was 0.15002
Best fit Lasso RMSE on testing data was 25.546
Best params used:
{'alpha': 0.0001}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.283
Default SGD RMSE on training data was 25.107
Default SGD R^2 score was 0.14857
Default SGD RMSE on testing data was 25.568
Best fit SGD RMSE on training data was 25.112
Best fit SGD R^2 score was 0.14826
Best fit SGD RMSE on testing data was 25.575
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.2222222222222222}
Results for 41 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.277
Default Ridge RMSE on training data was 25.084
Default Ridge R^2 score was 0.15016
Default Ridge RMSE on testing data was 25.542
Best fit Ridge RMSE on training data was 25.102
Best fit Ridge R^2 score was 0.14895
Best fit Ridge RMSE on testing data was 25.576
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.083
Best fit Lasso R^2 score was 0.15019
Best fit Lasso RMSE on testing data was 25.540
Best params used:
{'alpha': 0.0001}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.278
Default SGD RMSE on training data was 25.117
Default SGD R^2 score was 0.14789
Default SGD RMSE on testing data was 25.590
Best fit SGD RMSE on training data was 25.111
Best fit SGD R^2 score was 0.14829
Best fit SGD RMSE on testing data was 25.576
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.5555555555555556}
Results for 42 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.277
Default Ridge RMSE on training data was 25.078
Default Ridge R^2 score was 0.15055
Default Ridge RMSE on testing data was 25.520
Best fit Ridge RMSE on training data was 25.096
Best fit Ridge R^2 score was 0.14933
Best fit Ridge RMSE on testing data was 25.556
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.078
Best fit Lasso R^2 score was 0.15058
Best fit Lasso RMSE on testing data was 25.518
Best params used:
{'alpha': 0.0001}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.255
Default SGD RMSE on training data was 25.103
Default SGD R^2 score was 0.14886
Default SGD RMSE on testing data was 25.531
Best fit SGD RMSE on training data was 25.107
Best fit SGD R^2 score was 0.14856
Best fit SGD RMSE on testing data was 25.553
Best params used:
{'alpha': 0.0001, 'l1_ratio': 1.0}
Results for 43 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.282
Default Ridge RMSE on training data was 25.077
Default Ridge R^2 score was 0.15064
Default Ridge RMSE on testing data was 25.527
Best fit Ridge RMSE on training data was 25.095
Best fit Ridge R^2 score was 0.14941
Best fit Ridge RMSE on testing data was 25.561
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.076
Best fit Lasso R^2 score was 0.15067
Best fit Lasso RMSE on testing data was 25.525
Best params used:
{'alpha': 0.0001}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.293
Default SGD RMSE on training data was 25.117
Default SGD R^2 score was 0.14789
Default SGD RMSE on testing data was 25.588
Best fit SGD RMSE on training data was 25.097
Best fit SGD R^2 score was 0.14924
Best fit SGD RMSE on testing data was 25.546
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.7777777777777777}
Results for 44 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.278
Default Ridge RMSE on training data was 25.069
Default Ridge R^2 score was 0.15114
Default Ridge RMSE on testing data was 25.535
Best fit Ridge RMSE on training data was 25.087
Best fit Ridge R^2 score was 0.14992
Best fit Ridge RMSE on testing data was 25.565
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.069
Best fit Lasso R^2 score was 0.15117
Best fit Lasso RMSE on testing data was 25.534
Best params used:
{'alpha': 0.0001}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.284
Default SGD RMSE on training data was 25.116
Default SGD R^2 score was 0.14797
Default SGD RMSE on testing data was 25.595
Best fit SGD RMSE on training data was 25.098
Best fit SGD R^2 score was 0.14918
Best fit SGD RMSE on testing data was 25.568
Best params used:
{'alpha': 0.0001, 'l1_ratio': 1.0}
Results for 45 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.286
Default Ridge RMSE on training data was 25.069
Default Ridge R^2 score was 0.15115
Default Ridge RMSE on testing data was 25.534
Best fit Ridge RMSE on training data was 25.087
Best fit Ridge R^2 score was 0.14993
Best fit Ridge RMSE on testing data was 25.564
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.141
Best fit Lasso R^2 score was 0.14626
Best fit Lasso RMSE on testing data was 25.645
Best params used:
{'alpha': 0.05272631578947369}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.297
Default SGD RMSE on training data was 25.097
Default SGD R^2 score was 0.14929
Default SGD RMSE on testing data was 25.555
Best fit SGD RMSE on training data was 25.105
Best fit SGD R^2 score was 0.14871
Best fit SGD RMSE on testing data was 25.579
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.8888888888888888}
Results for 46 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.286
Default Ridge RMSE on training data was 25.069
Default Ridge R^2 score was 0.15116
Default Ridge RMSE on testing data was 25.535
Best fit Ridge RMSE on training data was 25.083
Best fit Ridge R^2 score was 0.15021
Best fit Ridge RMSE on testing data was 25.566
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.069
Best fit Lasso R^2 score was 0.15118
Best fit Lasso RMSE on testing data was 25.533
Best params used:
{'alpha': 0.0001}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.281
Default SGD RMSE on training data was 25.113
Default SGD R^2 score was 0.14816
Default SGD RMSE on testing data was 25.596
Best fit SGD RMSE on training data was 25.107
Best fit SGD R^2 score was 0.14860
Best fit SGD RMSE on testing data was 25.591
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.7777777777777777}
Results for 47 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.295
Default Ridge RMSE on training data was 25.067
Default Ridge R^2 score was 0.15129
Default Ridge RMSE on testing data was 25.535
Best fit Ridge RMSE on training data was 25.082
Best fit Ridge R^2 score was 0.15032
Best fit Ridge RMSE on testing data was 25.567
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.139
Best fit Lasso R^2 score was 0.14641
Best fit Lasso RMSE on testing data was 25.646
Best params used:
{'alpha': 0.05272631578947369}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.305
Default SGD RMSE on training data was 25.094
Default SGD R^2 score was 0.14949
Default SGD RMSE on testing data was 25.564
Best fit SGD RMSE on training data was 25.170
Best fit SGD R^2 score was 0.14435
Best fit SGD RMSE on testing data was 25.719
Best params used:
{'alpha': 0.06676, 'l1_ratio': 1.0}
Results for 48 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.298
Default Ridge RMSE on training data was 25.065
Default Ridge R^2 score was 0.15146
Default Ridge RMSE on testing data was 25.537
Best fit Ridge RMSE on training data was 25.079
Best fit Ridge R^2 score was 0.15048
Best fit Ridge RMSE on testing data was 25.568
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.139
Best fit Lasso R^2 score was 0.14641
Best fit Lasso RMSE on testing data was 25.646
Best params used:
{'alpha': 0.05272631578947369}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.299
Default SGD RMSE on training data was 25.111
Default SGD R^2 score was 0.14833
Default SGD RMSE on testing data was 25.600
Best fit SGD RMSE on training data was 25.101
Best fit SGD R^2 score was 0.14899
Best fit SGD RMSE on testing data was 25.549
Best params used:
{'alpha': 0.0001, 'l1_ratio': 1.0}
Results for 49 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.298
Default Ridge RMSE on training data was 25.062
Default Ridge R^2 score was 0.15165
Default Ridge RMSE on testing data was 25.542
Best fit Ridge RMSE on training data was 25.076
Best fit Ridge R^2 score was 0.15071
Best fit Ridge RMSE on testing data was 25.572
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.130
Best fit Lasso R^2 score was 0.14701
Best fit Lasso RMSE on testing data was 25.653
Best params used:
{'alpha': 0.05272631578947369}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.319
Default SGD RMSE on training data was 25.088
Default SGD R^2 score was 0.14989
Default SGD RMSE on testing data was 25.578
Best fit SGD RMSE on training data was 25.086
Best fit SGD R^2 score was 0.15001
Best fit SGD RMSE on testing data was 25.570
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.7777777777777777}
Results for 50 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.298
Default Ridge RMSE on training data was 25.062
Default Ridge R^2 score was 0.15165
Default Ridge RMSE on testing data was 25.542
Best fit Ridge RMSE on training data was 25.076
Best fit Ridge R^2 score was 0.15072
Best fit Ridge RMSE on testing data was 25.571
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.130
Best fit Lasso R^2 score was 0.14706
Best fit Lasso RMSE on testing data was 25.653
Best params used:
{'alpha': 0.05272631578947369}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.323
Default SGD RMSE on training data was 25.081
Default SGD R^2 score was 0.15035
Default SGD RMSE on testing data was 25.550
Best fit SGD RMSE on training data was 25.095
Best fit SGD R^2 score was 0.14943
Best fit SGD RMSE on testing data was 25.551
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.1111111111111111}
Results for 51 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.305
Default Ridge RMSE on training data was 25.061
Default Ridge R^2 score was 0.15174
Default Ridge RMSE on testing data was 25.537
Best fit Ridge RMSE on training data was 25.074
Best fit Ridge R^2 score was 0.15081
Best fit Ridge RMSE on testing data was 25.569
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.130
Best fit Lasso R^2 score was 0.14706
Best fit Lasso RMSE on testing data was 25.653
Best params used:
{'alpha': 0.05272631578947369}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.293
Default SGD RMSE on training data was 25.080
Default SGD R^2 score was 0.15043
Default SGD RMSE on testing data was 25.565
Best fit SGD RMSE on training data was 25.087
Best fit SGD R^2 score was 0.14998
Best fit SGD RMSE on testing data was 25.566
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.2222222222222222}
Results for 52 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.307
Default Ridge RMSE on training data was 25.053
Default Ridge R^2 score was 0.15225
Default Ridge RMSE on testing data was 25.524
Best fit Ridge RMSE on training data was 25.067
Best fit Ridge R^2 score was 0.15132
Best fit Ridge RMSE on testing data was 25.552
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.123
Best fit Lasso R^2 score was 0.14749
Best fit Lasso RMSE on testing data was 25.640
Best params used:
{'alpha': 0.05272631578947369}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.327
Default SGD RMSE on training data was 25.076
Default SGD R^2 score was 0.15070
Default SGD RMSE on testing data was 25.554
Best fit SGD RMSE on training data was 25.109
Best fit SGD R^2 score was 0.14844
Best fit SGD RMSE on testing data was 25.543
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.8888888888888888}
Results for 53 features
------All popular data after dropping correlated variables, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 25.288
Default Ridge RMSE on training data was 25.029
Default Ridge R^2 score was 0.15390
Default Ridge RMSE on testing data was 25.517
Best fit Ridge RMSE on training data was 25.051
Best fit Ridge R^2 score was 0.15239
Best fit Ridge RMSE on testing data was 25.557
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 25.961
Default Lasso RMSE on training data was 25.955
Default Lasso R^2 score was 0.09012
Default Lasso RMSE on testing data was 26.578
Best fit Lasso RMSE on training data was 25.116
Best fit Lasso R^2 score was 0.14801
Best fit Lasso RMSE on testing data was 25.644
Best params used:
{'alpha': 0.05272631578947369}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 25.302
Default SGD RMSE on training data was 25.060
Default SGD R^2 score was 0.15181
Default SGD RMSE on testing data was 25.554
Best fit SGD RMSE on training data was 25.065
Best fit SGD R^2 score was 0.15145
Best fit SGD RMSE on testing data was 25.543
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.3333333333333333}
Again, our models have not improved in any noticeable way after dropping correlated variables. Our 'all' dataset has 54 features; dropping from 53 down through 30 features had no strong impact on accuracy.
Data is split by genre - country, pop, latin, R&B, jazz - and each genre subset is operated on independently of the others.
Overall performance is expected to increase. It is assumed the data is semi-clustered by genre, and that by splitting it accordingly, fewer outliers will be present. Model performance is expected to be average to good.
# Fit the regression model suite on each genre's full popular-era dataset.
# The 80/20 split is seeded (random_state=17) so results are reproducible.
for g in genres:
    split = MS.train_test_split(
        popGenre[g]['all'],
        popGenre[g]['all_target'],
        test_size=0.2,
        random_state=17,
    )
    popGenre[g]['models_all'] = constructModels(
        split,
        dataTitle='{} all data, 80/20 random split'.format(g),
    )
------country all data, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 26.097
Default Ridge RMSE on training data was 25.210
Default Ridge R^2 score was 0.13972
Default Ridge RMSE on testing data was 25.798
Best fit Ridge RMSE on training data was 25.300
Best fit Ridge R^2 score was 0.13359
Best fit Ridge RMSE on testing data was 25.758
Best params used:
{'alpha': 10.827586206896552}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 27.156
Default Lasso RMSE on training data was 27.180
Default Lasso R^2 score was 0.00000
Default Lasso RMSE on testing data was 27.379
Best fit Lasso RMSE on training data was 25.419
Best fit Lasso R^2 score was 0.12543
Best fit Lasso RMSE on testing data was 25.740
Best params used:
{'alpha': 0.10009}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 26.104
Default SGD RMSE on training data was 25.343
Default SGD R^2 score was 0.13065
Default SGD RMSE on testing data was 25.777
Best fit SGD RMSE on training data was 25.386
Best fit SGD R^2 score was 0.12765
Best fit SGD RMSE on testing data was 25.857
Best params used:
{'alpha': 0.0001, 'l1_ratio': 0.3103448275862069}
------jazz all data, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 14.157
Default Ridge RMSE on training data was 13.169
Default Ridge R^2 score was 0.13949
Default Ridge RMSE on testing data was 14.715
Best fit Ridge RMSE on training data was 13.393
Best fit Ridge R^2 score was 0.10999
Best fit Ridge RMSE on testing data was 14.331
Best params used:
{'alpha': 19.344827586206897}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 14.201
Default Lasso RMSE on training data was 14.196
Default Lasso R^2 score was 0.00000
Default Lasso RMSE on testing data was 14.418
Best fit Lasso RMSE on training data was 13.462
Best fit Lasso R^2 score was 0.10075
Best fit Lasso RMSE on testing data was 14.020
Best params used:
{'alpha': 0.13341999999999998}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 14.063
Default SGD RMSE on training data was 13.311
Default SGD R^2 score was 0.12086
Default SGD RMSE on testing data was 14.380
Best fit SGD RMSE on training data was 13.500
Best fit SGD R^2 score was 0.09565
Best fit SGD RMSE on testing data was 14.226
Best params used:
{'alpha': 0.03343, 'l1_ratio': 0.06896551724137931}
------latin all data, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 26.756
Default Ridge RMSE on training data was 25.711
Default Ridge R^2 score was 0.14685
Default Ridge RMSE on testing data was 26.426
Best fit Ridge RMSE on training data was 25.826
Best fit Ridge R^2 score was 0.13913
Best fit Ridge RMSE on testing data was 26.220
Best params used:
{'alpha': 5.586206896551724}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 27.809
Default Lasso RMSE on training data was 27.798
Default Lasso R^2 score was 0.00271
Default Lasso RMSE on testing data was 26.945
Best fit Lasso RMSE on training data was 25.867
Best fit Lasso R^2 score was 0.13643
Best fit Lasso RMSE on testing data was 26.140
Best params used:
{'alpha': 0.06676}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 26.886
Default SGD RMSE on training data was 26.004
Default SGD R^2 score was 0.12728
Default SGD RMSE on testing data was 26.265
Best fit SGD RMSE on training data was 26.178
Best fit SGD R^2 score was 0.11552
Best fit SGD RMSE on testing data was 26.195
Best params used:
{'alpha': 0.03343, 'l1_ratio': 0.5517241379310345}
------pop all data, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 14.863
Default Ridge RMSE on training data was 13.728
Default Ridge R^2 score was 0.05160
Default Ridge RMSE on testing data was 14.705
Best fit Ridge RMSE on training data was 13.837
Best fit Ridge R^2 score was 0.03650
Best fit Ridge RMSE on testing data was 14.609
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 14.105
Default Lasso RMSE on training data was 14.097
Default Lasso R^2 score was 0.00000
Default Lasso RMSE on testing data was 14.599
Best fit Lasso RMSE on training data was 14.097
Best fit Lasso R^2 score was 0.00000
Best fit Lasso RMSE on testing data was 14.599
Best params used:
{'alpha': 0.43338999999999994}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 14.705
Default SGD RMSE on training data was 13.905
Default SGD R^2 score was 0.02700
Default SGD RMSE on testing data was 14.603
Best fit SGD RMSE on training data was 14.097
Best fit SGD R^2 score was -0.00001
Best fit SGD RMSE on testing data was 14.599
Best params used:
{'alpha': 0.93334, 'l1_ratio': 0.8620689655172413}
------r&b all data, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 28.608
Default Ridge RMSE on training data was 27.778
Default Ridge R^2 score was 0.07259
Default Ridge RMSE on testing data was 29.379
Best fit Ridge RMSE on training data was 27.899
Best fit Ridge R^2 score was 0.06448
Best fit Ridge RMSE on testing data was 29.342
Best params used:
{'alpha': 12.793103448275861}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 28.848
Default Lasso RMSE on training data was 28.840
Default Lasso R^2 score was 0.00031
Default Lasso RMSE on testing data was 29.087
Best fit Lasso RMSE on training data was 28.073
Best fit Lasso R^2 score was 0.05280
Best fit Lasso RMSE on testing data was 29.029
Best params used:
{'alpha': 0.16674999999999998}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 28.657
Default SGD RMSE on training data was 27.961
Default SGD R^2 score was 0.06033
Default SGD RMSE on testing data was 29.366
Best fit SGD RMSE on training data was 28.043
Best fit SGD R^2 score was 0.05477
Best fit SGD RMSE on testing data was 29.131
Best params used:
{'alpha': 0.10009, 'l1_ratio': 1.0}
Breaking the data out by genre has produced disappointing results. The genre breakdown does not seem to have a large effect on overall model accuracy, with the exception of R&B, which is ~4% worse on average than the all-data runs. (Note that jazz and pop only track the top 50 songs, so the lower RMSE values they produce - when considered against the 100-point scale used by the other genres and the all-data runs - are equally poor.)
# Repeat the per-genre model fitting, restricted to the most recent 3 years.
# Same seeded 80/20 split as the full-data runs for a fair comparison.
for g in genres:
    split = MS.train_test_split(
        popGenre[g]['recent'],
        popGenre[g]['recent_target'],
        test_size=0.2,
        random_state=17,
    )
    popGenre[g]['models_recent'] = constructModels(
        split,
        dataTitle='{} all data, recent 3 years, 80/20 random split'.format(g),
    )
------country all data, recent 3 years, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 28.914
Default Ridge RMSE on training data was 26.737
Default Ridge R^2 score was 0.11176
Default Ridge RMSE on testing data was 32.714
Best fit Ridge RMSE on training data was 27.305
Best fit Ridge R^2 score was 0.07361
Best fit Ridge RMSE on testing data was 31.461
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 28.332
Default Lasso RMSE on training data was 28.031
Default Lasso R^2 score was 0.02372
Default Lasso RMSE on testing data was 30.715
Best fit Lasso RMSE on training data was 27.719
Best fit Lasso R^2 score was 0.04531
Best fit Lasso RMSE on testing data was 31.031
Best params used:
{'alpha': 0.63337}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 28.663
Default SGD RMSE on training data was 26.993
Default SGD R^2 score was 0.09468
Default SGD RMSE on testing data was 32.679
Best fit SGD RMSE on training data was 27.974
Best fit SGD R^2 score was 0.02766
Best fit SGD RMSE on testing data was 31.349
Best params used:
{'alpha': 0.36673, 'l1_ratio': 0.3793103448275862}
------jazz all data, recent 3 years, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 15.993
Default Ridge RMSE on training data was 13.522
Default Ridge R^2 score was 0.09669
Default Ridge RMSE on testing data was 15.847
Best fit Ridge RMSE on training data was 13.906
Best fit Ridge R^2 score was 0.04465
Best fit Ridge RMSE on testing data was 14.875
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 14.273
Default Lasso RMSE on training data was 14.227
Default Lasso R^2 score was 0.00000
Default Lasso RMSE on testing data was 14.554
Best fit Lasso RMSE on training data was 14.227
Best fit Lasso R^2 score was 0.00000
Best fit Lasso RMSE on testing data was 14.554
Best params used:
{'alpha': 0.8666799999999999}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 15.061
Default SGD RMSE on training data was 13.780
Default SGD R^2 score was 0.06188
Default SGD RMSE on testing data was 15.285
Best fit SGD RMSE on training data was 14.319
Best fit SGD R^2 score was -0.01294
Best fit SGD RMSE on testing data was 15.042
Best params used:
{'alpha': 2.0, 'l1_ratio': 0.20689655172413793}
------latin all data, recent 3 years, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 28.834
Default Ridge RMSE on training data was 25.520
Default Ridge R^2 score was 0.22368
Default Ridge RMSE on testing data was 28.640
Best fit Ridge RMSE on training data was 26.469
Best fit Ridge R^2 score was 0.16483
Best fit Ridge RMSE on testing data was 26.556
Best params used:
{'alpha': 17.379310344827587}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 28.315
Default Lasso RMSE on training data was 27.845
Default Lasso R^2 score was 0.07578
Default Lasso RMSE on testing data was 26.393
Best fit Lasso RMSE on training data was 26.461
Best fit Lasso R^2 score was 0.16534
Best fit Lasso RMSE on testing data was 27.033
Best params used:
{'alpha': 0.40005999999999997}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 28.531
Default SGD RMSE on training data was 26.091
Default SGD R^2 score was 0.18857
Default SGD RMSE on testing data was 27.242
Best fit SGD RMSE on training data was 27.213
Best fit SGD R^2 score was 0.11723
Best fit SGD RMSE on testing data was 26.634
Best params used:
{'alpha': 0.7000299999999999, 'l1_ratio': 1.0}
------pop all data, recent 3 years, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 15.933
Default Ridge RMSE on training data was 13.513
Default Ridge R^2 score was 0.12605
Default Ridge RMSE on testing data was 15.616
Best fit Ridge RMSE on training data was 14.003
Best fit Ridge R^2 score was 0.06157
Best fit Ridge RMSE on testing data was 15.206
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 14.326
Default Lasso RMSE on training data was 14.455
Default Lasso R^2 score was 0.00000
Default Lasso RMSE on testing data was 14.975
Best fit Lasso RMSE on training data was 14.455
Best fit Lasso R^2 score was 0.00000
Best fit Lasso RMSE on testing data was 14.975
Best params used:
{'alpha': 1.6666666666666665}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 15.141
Default SGD RMSE on training data was 13.748
Default SGD R^2 score was 0.09539
Default SGD RMSE on testing data was 15.942
Best fit SGD RMSE on training data was 14.670
Best fit SGD R^2 score was -0.03001
Best fit SGD RMSE on testing data was 15.631
Best params used:
{'alpha': 2.0, 'l1_ratio': 0.48275862068965514}
------r&b all data, recent 3 years, 80/20 random split------
------Regression model results
----------Ridge model----------
Default Ridge mean RMSE from 10-fold cross validation was 30.534
Default Ridge RMSE on training data was 27.352
Default Ridge R^2 score was 0.12440
Default Ridge RMSE on testing data was 31.407
Best fit Ridge RMSE on training data was 28.188
Best fit Ridge R^2 score was 0.07001
Best fit Ridge RMSE on testing data was 29.526
Best params used:
{'alpha': 20.0}
----------Lasso model----------
Default Lasso mean RMSE from 10-fold cross validation was 29.506
Default Lasso RMSE on training data was 29.107
Default Lasso R^2 score was 0.00840
Default Lasso RMSE on testing data was 28.528
Best fit Lasso RMSE on training data was 29.230
Best fit Lasso R^2 score was 0.00000
Best fit Lasso RMSE on testing data was 28.434
Best params used:
{'alpha': 2.0}
----------SGD model----------
Default SGD mean RMSE from 10-fold cross validation was 29.883
Default SGD RMSE on training data was 27.958
Default SGD R^2 score was 0.08517
Default SGD RMSE on testing data was 30.396
Best fit SGD RMSE on training data was 29.024
Best fit SGD R^2 score was 0.01404
Best fit SGD RMSE on testing data was 29.146
Best params used:
{'alpha': 0.8333499999999999, 'l1_ratio': 0.20689655172413793}
Surprisingly, using only the last 3 years of data has produced significantly worse results than any other data breakdown, with the average being around 1-2% worse, but country producing a model 6% worse on average than using all data.
Splitting on the last 3 years of data was done based on the assumption that music trends change with time, and using all historical data could potentially throw off the model. It seems the general trends are stronger when looking at the data as a whole than just the past 3 years.
Regardless of parameters or data, the models produced consistently performed poorly. The best RMSE obtained was around 25% of the range of possible values, regardless of data breakdown or variant, with worse models ending up with over 30%.
Splitting by genre, while initially assumed to be the best option, produced, at best, equivalent results to no breakdown at all, and at worst, the worst models made overall.
The results here agree with most of the clustering and classification results observed — which is to say, there do not appear to be any strong trends between our data and song popularity.